{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8e91e08e-e1ca-4878-89ec-df53f1569667",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025-07-08 11:44:47.760174: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c418ec68bdfc487b885969eac6086da7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "\n",
    "# Checkpoint directory. Set the QWEN_MODEL_PATH environment variable to use a\n",
    "# different location; the previously hard-coded directory remains the default.\n",
    "model_name = os.environ.get(\"QWEN_MODEL_PATH\", \"/home/lyz/hf-models/Qwen2.5-3B-Instruct/\")\n",
    "\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    model_name,\n",
    "    torch_dtype=\"auto\",\n",
    "    device_map=\"auto\"\n",
    ")\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "\n",
    "# Smoke test: run one chat-formatted prompt through the model and keep the reply in `response`.\n",
    "prompt = \"Give me a short introduction to large language model.\"\n",
    "messages = [\n",
    "    {\"role\": \"system\", \"content\": \"You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\"},\n",
    "    {\"role\": \"user\", \"content\": prompt}\n",
    "]\n",
    "# tokenize=False returns the rendered chat text; it is tokenized in the next step.\n",
    "text = tokenizer.apply_chat_template(\n",
    "    messages,\n",
    "    tokenize=False,\n",
    "    add_generation_prompt=True\n",
    ")\n",
    "model_inputs = tokenizer([text], return_tensors=\"pt\").to(model.device)\n",
    "\n",
    "generated_ids = model.generate(\n",
    "    **model_inputs,\n",
    "    max_new_tokens=512\n",
    ")\n",
    "# Drop the first len(input_ids) tokens of every output sequence (the prompt portion)\n",
    "# so that only the continuation is decoded.\n",
    "generated_ids = [\n",
    "    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)\n",
    "]\n",
    "\n",
    "response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "53048ef3-c88c-4c75-b89e-1c316db86c90",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Certainly! A large language model (LLM) is a type of artificial intelligence designed to understand and generate human-like text based on vast amounts of training data. These models are trained using machine learning algorithms to recognize patterns and relationships within the text they encounter during training. \\n\\nKey characteristics of large language models include:\\n\\n1. **Size**: They are typically trained on enormous datasets, often consisting of billions or trillions of words.\\n2. **Complexity**: They have complex architectures that allow them to handle various types of language tasks, including translation, summarization, question-answering, and even creative writing.\\n3. **Interactivity**: They can interact with users in real-time, responding to queries and generating text dynamically.\\n4. **Adaptability**: They can be fine-tuned for specific domains or applications, making them highly versatile.\\n\\nLarge language models like myself are capable of understanding context, reasoning about information, and producing coherent and relevant text, which makes them valuable tools across multiple industries, from technology and entertainment to education and customer service.'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the decoded reply stored in `response` by the generation cell.\n",
    "response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "adda1af7-9199-4e2b-9e50-3fb1144c3a4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Instruction prompt (in Chinese) for plant-domain named-entity recognition.\n",
    "# The literal marker @{0}@ is the slot for the sentence to analyse; the inference\n",
    "# loop fills it with str.replace, so the JSON braces below are plain text.\n",
    "# The model is asked to answer with a JSON object whose keys are\n",
    "# plant, class, environment, area and altitude, each holding a list.\n",
    "PROMPT_TEMPLATE = \"\"\"你是一个专业的植物命名实体识别专家，请识别句子中实体：@{0}@\n",
    "\n",
    "\n",
    "其中需要识别的实体如下：\n",
    "- 植物 (plant)：植物的通用名称或俗称，如细瘦糯米条/木里秋葵/察隅冷杉。\n",
    "- 保护等级 (class)：植物受到的保护级别，如国家级、省级、濒危、二级等\n",
    "- 生长环境 (environment)：描述植物生长的自然环境、土壤条件或生态位，如沼泽/冻原/平原/芦苇丛/季雨林/峡谷/苔藓地。\n",
    "- 分布地区 (area)：植物在地理上的分布区域，如巴西/中国/印度/阿富汗。\n",
    "- 海拔 (altitude)：植物主要生长的海拔范围，如海拔4200米。\n",
    "\n",
    "识别的实体需要来自原始句子，并且是连续的单词，如果存在多个实体则存为列表。识别结果填充为如下json，不需要有其他任何输出\n",
    "{\n",
    "    \"plant\": [],\n",
    "    \"class\": [],\n",
    "    \"environment\": [],\n",
    "    \"area\": [],\n",
    "    \"altitude\": []\n",
    "}\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a46748aa-af5c-4e0c-afbf-7984e297b940",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "\n",
    "def _parse_model_json(raw_output):\n",
    "    \"\"\"Parse the model reply into a dict; return {} when no JSON object is found.\n",
    "\n",
    "    The reply is tried as-is first. If that fails, the span from the first \"{\"\n",
    "    to the last \"}\" is tried, so replies wrapped in Markdown code fences or\n",
    "    surrounded by extra prose are still usable.\n",
    "    \"\"\"\n",
    "    candidates = [raw_output]\n",
    "    start, end = raw_output.find(\"{\"), raw_output.rfind(\"}\")\n",
    "    if 0 <= start < end:\n",
    "        candidates.append(raw_output[start:end + 1])\n",
    "    for candidate in candidates:\n",
    "        try:\n",
    "            parsed = json.loads(candidate)\n",
    "        except ValueError:  # json.JSONDecodeError is a ValueError subclass\n",
    "            continue\n",
    "        if isinstance(parsed, dict):\n",
    "            return parsed\n",
    "    return {}\n",
    "\n",
    "\n",
    "# Both files are opened as UTF-8 explicitly: the sentences are Chinese and the\n",
    "# records are written with ensure_ascii=False, so the platform default encoding\n",
    "# must not be relied on.\n",
    "# NOTE: submit.jsonl is still opened in append mode, so re-running this cell adds\n",
    "# duplicate records; delete the file first for a clean run.\n",
    "with open(\"test.jsonl\", encoding=\"utf-8\") as test_file, open(\"submit.jsonl\", \"a\", encoding=\"utf-8\") as up:\n",
    "    for line in test_file:\n",
    "        if not line.strip():\n",
    "            continue  # skip blank lines rather than crash in json.loads\n",
    "        origin_text = json.loads(line)[\"text\"]\n",
    "        result_label = {}\n",
    "\n",
    "        try:\n",
    "            messages = [\n",
    "                {\"role\": \"user\", \"content\": PROMPT_TEMPLATE.replace(\"@{0}@\", origin_text)}\n",
    "            ]\n",
    "            text = tokenizer.apply_chat_template(\n",
    "                messages,\n",
    "                tokenize=False,\n",
    "                add_generation_prompt=True\n",
    "            )\n",
    "            model_inputs = tokenizer([text], return_tensors=\"pt\").to(model.device)\n",
    "\n",
    "            generated_ids = model.generate(\n",
    "                **model_inputs,\n",
    "                max_new_tokens=512\n",
    "            )\n",
    "            # Keep only the tokens after the prompt portion of each sequence.\n",
    "            generated_ids = [\n",
    "                output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)\n",
    "            ]\n",
    "\n",
    "            result = _parse_model_json(\n",
    "                tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]\n",
    "            )\n",
    "\n",
    "            for category, words in result.items():\n",
    "                if isinstance(words, str):\n",
    "                    words = [words]  # a bare string would otherwise be iterated per character\n",
    "                elif not isinstance(words, list):\n",
    "                    continue\n",
    "                for word in words:\n",
    "                    # Empty strings would yield the bogus span [0, -1]; non-strings cannot be searched.\n",
    "                    if not isinstance(word, str) or not word:\n",
    "                        continue\n",
    "                    start = origin_text.find(word)\n",
    "                    if start == -1:\n",
    "                        continue  # entity text does not occur in the sentence\n",
    "                    # Inclusive [start, end] character offsets of the first occurrence.\n",
    "                    result_label.setdefault(category, {})[word] = [[start, start + len(word) - 1]]\n",
    "        except Exception as exc:\n",
    "            # Best effort: a failing sentence is still written with whatever labels were\n",
    "            # collected. Unlike a bare except, this lets KeyboardInterrupt stop the loop.\n",
    "            print(f\"Labelling failed for {origin_text!r}: {exc!r}\")\n",
    "\n",
    "        up.write(json.dumps({\n",
    "            \"text\": origin_text,\n",
    "            \"label\": result_label\n",
    "        }, ensure_ascii=False) + \"\\n\")\n",
    "        up.flush()  # persist each record immediately, as the per-record open/close did"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6532c5f3-53f4-4088-9043-e31fccf4940b",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'origin_text' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43morigin_text\u001b[49m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'origin_text' is not defined"
     ]
    }
   ],
   "source": [
    "\n",
    "# Debug probe: `origin_text` is only assigned inside the inference loop cell,\n",
    "# so this raises NameError on a fresh kernel until that cell has been run.\n",
    "origin_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "264e4143-2a90-41f4-8839-dfbbd1d2be4a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'苔藓地'"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Debug probe: `word` is the loop variable left over from the labelling loop\n",
    "# in the inference cell, i.e. the last entity candidate it iterated.\n",
    "word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "42c75ba3-19ac-4d02-90bb-7e2aa8cc1f3f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9db1498a-f422-45ea-9b18-368b0acf51a1",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py3.11",
   "language": "python",
   "name": "py3.11"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
